2. Word2Vec



In [ ]:

    
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib



In [ ]:

    
# Select the Tk-based backend; this runs before pyplot is imported in the next cell.
# NOTE(review): the `%matplotlib inline` magic that follows selects the inline
# backend, so this call looks redundant for notebook use — confirm before removing.
matplotlib.use("TkAgg")



In [ ]:

    
import matplotlib.pyplot as plt
%matplotlib inline

Dataset

Julius Caesar
Macbeth



In [ ]:

    
# Directory containing the input files. The trailing slash matters because
# file names are appended to it by plain string concatenation.
data_dir = '../data/'



In [ ]:

    
# Path to the Macbeth text, located under `data_dir`.
macbeth_file = data_dir + 'macbeth.txt'



In [ ]:

    
# Path to the Julius Caesar text, located under `data_dir`.
caesar_file = data_dir + 'julius_caesar.txt'

Remove the stopwords



In [ ]:

    
# Path to the stop-word list, located under `data_dir`.
# presumably one stop word per line — verify against the file.
stopword_file = data_dir + 'long_stopwords.txt'



In [ ]:

    
# Load the stop-word list: one entry per line of `stopword_file`, with every
# non-alphanumeric character (including the trailing newline) removed.
#
# The result is built as a real list rather than a lazy `map` object. On
# Python 3 `map` returns a one-shot iterator, so the first `in` test against
# it would consume it and every later membership test would come back False,
# silently disabling stop-word filtering. A list behaves the same on
# Python 2 and Python 3.
with open(stopword_file, 'r') as inpFile:
    stop_words = [re.sub('[^A-Za-z0-9]+', '', line) for line in inpFile]



In [ ]:

    
# Display the loaded stop words for a quick visual check.
stop_words



In [ ]:

    
# Inspect the container type of `stop_words`; it has to support repeated
# `in` tests, because `clean` performs one membership check per word.
type(stop_words)



In [ ]:

    
def clean(word):
    """Normalise a raw token and blank it out if it is a stop word.

    The token is stripped of surrounding whitespace, lower-cased and reduced
    to its ASCII letters and digits.

    Returns the normalised token, or the empty string when that token is
    found in the module-level ``stop_words`` collection.
    """
    normalised = re.sub('[^A-Za-z0-9]+', '', word.strip().lower())
    return '' if normalised in stop_words else normalised



In [ ]:

    
# Example: the apostrophe is stripped, so "king's" is normalised to "kings"
# before the stop-word lookup.
clean("king's")



In [ ]:

    
# Example: "they'll" is normalised to "theyll"; the call returns '' if that
# form is present in the stop-word list.
clean("they'll")



In [ ]:

    
def _read_sentences(path):
    """Return the cleaned, tokenised lines of the text file at ``path``.

    Each line is split on whitespace and every token is normalised with
    ``clean``. Tokens that come back empty (stop words, pure punctuation)
    are dropped, and lines left without any token are skipped.

    Returns:
        list[list[str]]: one list of words per non-empty line.

    Real lists are built on purpose. On Python 3, ``map``/``filter`` return
    one-shot iterators, and Word2Vec walks the corpus more than once
    (vocabulary scan, then training), so lazy objects would be empty on the
    second pass.
    """
    tokenised = []
    with open(path, 'r') as inpFile:
        for line in inpFile:
            words = [word for word in map(clean, line.split()) if word]
            # The original guard (`line is not None or line != '\n'`) was
            # always true; skipping empty token lists is what it was after.
            if words:
                tokenised.append(words)
    return tokenised


line_count = 0  # not used in this cell; kept so the cell still defines the name
sentences = _read_sentences(macbeth_file) + _read_sentences(caesar_file)



In [ ]:

    
# Inspect the type of the corpus container that is handed to Word2Vec below.
type(sentences)

Word2Vec model



In [ ]:

    
# Train a Word2Vec model on the tokenised corpus: context window of 5 words,
# 500-dimensional vectors, 4 worker threads, and a minimum word count of 5.
# NOTE(review): gensim 4.0 renamed `size` to `vector_size` — confirm the
# installed gensim version before changing the keyword.
# NOTE(review): no seed is passed, so results presumably vary between runs — confirm.
model = Word2Vec(sentences, window=5, size=500, workers=4, min_count=5)



In [ ]:

    
# Display the vocabulary the trained model kept.
# NOTE(review): newer gensim releases expose the vocabulary through `model.wv`
# instead — confirm against the installed version.
model.vocab



In [ ]:

    
# Collect every vocabulary word and its vector, in matching order, as the
# input for the 2-D projection and its point labels.
labels = list(model.vocab)
tokens = [model[label] for label in labels]

t-SNE plot to visualise the similarity of words



In [ ]:

    
# Configure a 2-D t-SNE projection of the word vectors: perplexity 30,
# PCA initialisation and 5000 optimisation iterations.
# `random_state` is pinned so that any randomness inside t-SNE is repeatable
# and the same input vectors give the same layout on every run.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000,
                  random_state=0)



In [ ]:

    
# Fit t-SNE on the word vectors and keep the projected coordinates
# (the model was configured with n_components=2, so two values per word).
new_values = tsne_model.fit_transform(tokens)



In [ ]:

    
# Split the projected points into separate x and y coordinate lists for plotting.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]



In [ ]:

    
# Draw each word as its own scatter point and annotate it with the word.
plt.figure(figsize=(16, 12))
for idx, (x_pos, y_pos) in enumerate(zip(x, y)):
    plt.scatter(x_pos, y_pos)
    # The label is offset by (5, 2) points from the marker it belongs to.
    plt.annotate(
        labels[idx],
        xy=(x_pos, y_pos),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()

Analogies



In [ ]:

    
# Analogy query: words most similar to "caesar" + "duncan" - "scotland".
# NOTE(review): newer gensim releases expose this as `model.wv.most_similar` —
# confirm against the installed version.
model.most_similar(positive=['caesar','duncan'],negative=['scotland'])



In [ ]:

    
# Analogy query: words most similar to "caesar" + "duncan" - "macbeth".
model.most_similar(positive=['caesar','duncan'],negative=['macbeth'])



In [ ]:

    
# Analogy query: words most similar to "caesar" + "macbeth" - "banquo".
model.most_similar(positive=['caesar','macbeth'],negative=['banquo'])



In [ ]:

    
# Analogy query: words most similar to "rome" + "scotland" - "banquo".
model.most_similar(positive=['rome','scotland'],negative=['banquo'])



In [ ]:

    
# Ask the model which of the four words fits least well with the others.
model.doesnt_match("duncan macbeth scotland banquo".split())



In [ ]: